Install/Load Required Packages
We’ll use the tidyverse. In addition, we’ll use arule to implement association rules mining.
packages <- c("tidyverse", "haven", "arules", "plotly")
# Installing Packages
# install.packages(packages, update = TRUE, ask = FALSE)
# Loading Packages
for(package in packages) {
do.call("library", list(package))
}
Data Processing
We’ll first load the data from the NHANES website (data documentation) to a tibble. We’ll only look at prescriptions taken within the last month, and we’ll only need the patient identifier and drug name columns.
# Load data from NHANES into prescriptions_df
prescriptions_df <- read_xpt("https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/RXQ_RX_J.XPT")
# Filter to prescriptions that were taken in the last month
# Select patients (SEQN) and drug name (RXXDRUG)
prescriptions_df <- prescriptions_df %>%
filter(RXDUSE == 1) %>%
select(SEQN, RXDDRUG)
# View dataframe
prescriptions_df
For arules to work, we’ll need to group the data by the patient identifier, and we’ll the drug names for each patients to be stored within a list in the tibble. As a final step, we removed all patients with no drugs.
prescriptions_df <- prescriptions_df %>%
filter(
RXDDRUG != "55555",
RXDDRUG != "77777",
RXDDRUG != "99999",
RXDDRUG != ""
) %>%
group_by(SEQN) %>%
summarize(RXDDRUG = list(unique(RXDDRUG)))
prescriptions_df
Mine Frequent Itemsets
Now we can implement association rules mining. First we’ll mine the frequent itemsets using the apriori algorithm. We’ll generate a list of itemsets with a support of at least 0.005.
frequent_itemsets <- apriori(
as(prescriptions_df$RXDDRUG, "transactions"),
parameter = list(supp=0.005, target="frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 19
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[666 item(s), 3951 transaction(s)] done [0.00s].
sorting and recoding items ... [140 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [360 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
frequent_itemsets <- inspect(frequent_itemsets) %>%
arrange(desc(support))
frequent_itemsets
Next let’s look at association rules. From looking at the rules with the highest lift, we see that the insulin drugs are highly associated. What are other patterns that may be interesting?
rules <- apriori(
as(prescriptions_df$RXDDRUG, "transactions"),
parameter = list(supp=0.005, conf=0.005, target="rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 19
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[666 item(s), 3951 transaction(s)] done [0.00s].
sorting and recoding items ... [140 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [618 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
rules <-data.frame(inspect(rules)) %>%
filter(
lift >= 1,
lhs != "{}",
rhs != "{}"
) %>%
select(-Var.2) %>%
arrange(desc(lift))
rules
rules %>%
filter(
support > 0.02 &
confidence > 0.6
) %>%
select(
lhs,
rhs
)
NA
Visualize Rules
We can plot the support and confidence (or lift) or each rule on a scatterplot.
ggplot(rules) +
aes(x=support, y=confidence, label=lhs, label2=rhs) +
geom_point()

ggplotly()
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBJbnN0YWxsL0xvYWQgUmVxdWlyZWQgUGFja2FnZXMKCldlJ2xsIHVzZSB0aGUgdGlkeXZlcnNlLiBJbiBhZGRpdGlvbiwgd2UnbGwgdXNlIFthcnVsZV0oaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2FydWxlcy92ZXJzaW9ucy8xLjYtOCkgdG8gaW1wbGVtZW50IGFzc29jaWF0aW9uIHJ1bGVzIG1pbmluZy4KCmBgYHtyIHNldHVwLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpwYWNrYWdlcyA8LSBjKCJ0aWR5dmVyc2UiLCAiaGF2ZW4iLCAiYXJ1bGVzIiwgInBsb3RseSIpCgojIEluc3RhbGxpbmcgUGFja2FnZXMKIyBpbnN0YWxsLnBhY2thZ2VzKHBhY2thZ2VzLCB1cGRhdGUgPSBUUlVFLCBhc2sgPSBGQUxTRSkgCiAgCiMgTG9hZGluZyBQYWNrYWdlcwpmb3IocGFja2FnZSBpbiBwYWNrYWdlcykgewogIGRvLmNhbGwoImxpYnJhcnkiLCBsaXN0KHBhY2thZ2UpKQp9CmBgYAoKIyBEYXRhIFByb2Nlc3NpbmcKCldlJ2xsIGZpcnN0IGxvYWQgdGhlIGRhdGEgZnJvbSB0aGUgTkhBTkVTIHdlYnNpdGUgKFtkYXRhIGRvY3VtZW50YXRpb25dKGh0dHBzOi8vd3d3bi5jZGMuZ292L05jaHMvTmhhbmVzLzIwMTctMjAxOC9SWFFfUlhfSi5odG0pKSB0byBhIHRpYmJsZS4gV2UnbGwgb25seSBsb29rIGF0IHByZXNjcmlwdGlvbnMgdGFrZW4gd2l0aGluIHRoZSBsYXN0IG1vbnRoLCBhbmQgd2UnbGwgb25seSBuZWVkIHRoZSBwYXRpZW50IGlkZW50aWZpZXIgYW5kIGRydWcgbmFtZSBjb2x1bW5zLgoKYGBge3IgZGF0YV9wcm9jZXNzaW5nfQojIExvYWQgZGF0YSBmcm9tIE5IQU5FUyBpbnRvIHByZXNjcmlwdGlvbnNfZGYKcHJlc2NyaXB0aW9uc19kZiA8LSByZWFkX3hwdCgiaHR0cHM6Ly93d3duLmNkYy5nb3YvTmNocy9OaGFuZXMvMjAxNy0yMDE4L1JYUV9SWF9KLlhQVCIpCgojIEZpbHRlciB0byBwcmVzY3JpcHRpb25zIHRoYXQgd2VyZSB0YWtlbiBpbiB0aGUgbGFzdCBtb250aAojIFNlbGVjdCBwYXRpZW50cyAoU0VRTikgYW5kIGRydWcgbmFtZSAoUlhYRFJVRykgCnByZXNjcmlwdGlvbnNfZGYgPC0gcHJlc2NyaXB0aW9uc19kZiAlPiUKICBmaWx0ZXIoUlhEVVNFID09IDEpICU+JSAKICBzZWxlY3QoU0VRTiwgUlhERFJVRykKCiMgVmlldyBkYXRhZnJhbWUKcHJlc2NyaXB0aW9uc19kZgpgYGAKCkZvciBhcnVsZXMgdG8gd29yaywgd2UnbGwgbmVlZCB0byBncm91cCB0aGUgZGF0YSBieSB0aGUgcGF0aWVudCBpZGVudGlmaWVyLCBhbmQgd2UnbGwgdGhlIGRydWcgbmFtZXMgZm9yIGVhY2ggcGF0aWVudHMgdG8gYmUgc3RvcmVkIHdpdGhpbiBhIGxpc3QgaW4gdGhlIHRpYmJsZS4gQXMgYSBmaW5hbCBzdGVwLCB3ZSByZW1vdmVkIGFsbCBwYXRpZW50cyB3aXRoIG5vIGRydWdzLgoKYGBge3IgZ3JvdXBfYnlfcGF0aWVudHN9CnByZXNjcmlwdGlvbnNfZGYgPC0gcHJlc2NyaXB0aW9uc19kZiAlPiUgCiAgZmlsdGVyKAogICAgUlhERFJVRyAhPSAiNTU1NTUiLAogICAgUlhERFJVRyAhPSAiNzc3NzciLAogICAgUlhERFJVRyAhPSAiOTk5OTkiLAogICAgUlhERFJVRyAhPSAiIgogICkgJT4lCiAgZ3JvdXBfYnkoU0VRTikgJT4lIAogIHN1bW1hcml6ZShSWEREUlVHID0gbGlzdCh1bmlxdWUoUlhERFJVRykpKQoKcHJlc2NyaXB0aW9uc19kZgpgYGAKCiMgTWluZSBGcmVxdWVudCBJdGVtc2V0cwoKTm93IHdlIGNhbiBpbXBsZW1lbnQgYXNzb2NpYXRpb24gcnVsZXMgbWluaW5nLiBGaXJzdCB3ZSdsbCBtaW5lIHRoZSBmcmVxdWVudCBpdGVtc2V0cyB1c2luZyB0aGUgW2FwcmlvcmkgYWxnb3JpdGhtXShodHRwczovL2VuLndpa2lwZWRpYS5vcmcvd2lraS9BcHJpb3JpX2FsZ29yaXRobSkuIFdlJ2xsIGdlbmVyYXRlIGEgbGlzdCBvZiBpdGVtc2V0cyB3aXRoIGEgc3VwcG9ydCBvZiBhdCBsZWFzdCAwLjAwNS4KCmBgYHtyIGZyZXF1ZW50X2l0ZW1zZXRzfQpmcmVxdWVudF9pdGVtc2V0cyA8LSBhcHJpb3JpKAogIGFzKHByZXNjcmlwdGlvbnNfZGYkUlhERFJVRywgInRyYW5zYWN0aW9ucyIpLCAKICBwYXJhbWV0ZXIgPSBsaXN0KHN1cHA9MC4wMDUsIHRhcmdldD0iZnJlcXVlbnQgaXRlbXNldHMiKQopCgpmcmVxdWVudF9pdGVtc2V0cyA8LSBpbnNwZWN0KGZyZXF1ZW50X2l0ZW1zZXRzKSAlPiUgCiAgYXJyYW5nZShkZXNjKHN1cHBvcnQpKQoKZnJlcXVlbnRfaXRlbXNldHMKYGBgCgpOZXh0IGxldCdzIGxvb2sgYXQgYXNzb2NpYXRpb24gcnVsZXMuIEZyb20gbG9va2luZyBhdCB0aGUgcnVsZXMgd2l0aCB0aGUgaGlnaGVzdCBsaWZ0LCB3ZSBzZWUgdGhhdCB0aGUgaW5zdWxpbiBkcnVncyBhcmUgaGlnaGx5IGFzc29jaWF0ZWQuIFdoYXQgYXJlIG90aGVyIHBhdHRlcm5zIHRoYXQgbWF5IGJlIGludGVyZXN0aW5nPwoKYGBge3IgYXNzb2NpYXRpb25fcnVsZXN9CnJ1bGVzIDwtIGFwcmlvcmkoCiAgYXMocHJlc2NyaXB0aW9uc19kZiRSWEREUlVHLCAidHJhbnNhY3Rpb25zIiksIAogIHBhcmFtZXRlciA9IGxpc3Qoc3VwcD0wLjAwNSwgY29uZj0wLjAwNSwgdGFyZ2V0PSJydWxlcyIpCikKCnJ1bGVzIDwtZGF0YS5mcmFtZShpbnNwZWN0KHJ1bGVzKSkgJT4lCiAgZmlsdGVyKAogICAgbGlmdCA+PSAxLAogICAgbGhzICE9ICJ7fSIsCiAgICByaHMgIT0gInt9IgogICkgJT4lCiAgc2VsZWN0KC1WYXIuMikgJT4lCiAgYXJyYW5nZShkZXNjKGxpZnQpKQoKcnVsZXMKYGBgCgpgYGB7cn0KcnVsZXMgJT4lCiAgZmlsdGVyKAogICAgc3VwcG9ydCA+IDAuMDIgJgogICAgY29uZmlkZW5jZSA+IDAuNgogICkgJT4lCiAgc2VsZWN0KAogICAgbGhzLAogICAgcmhzCiAgKQogIApgYGAKCiMgVmlzdWFsaXplIFJ1bGVzCgpXZSBjYW4gcGxvdCB0aGUgc3VwcG9ydCBhbmQgY29uZmlkZW5jZSAob3IgbGlmdCkgb3IgZWFjaCBydWxlIG9uIGEgc2NhdHRlcnBsb3QuCgpgYGB7cn0KZ2dwbG90KHJ1bGVzKSArCiAgYWVzKHg9c3VwcG9ydCwgeT1jb25maWRlbmNlLCBsYWJlbD1saHMsIGxhYmVsMj1yaHMpICsKICBnZW9tX3BvaW50KCkKCmdncGxvdGx5KCkKYGBgCg==